               /*******************************************/
               /*          Continuous Semi-Hash            */
               /*          Author: Zeming Xu              */
               /*          Date:   Jan, 24, 2007          */
               /*          Copyright: Lenovo Corporation     */
               /*******************************************/

/*-----------------------INCLUDES--------------------------------------------*/

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

/*-----------------------DATA STRUCTURES-------------------------------------*/

// same sub-string
struct sameSubString{
  unsigned long int compressedVersionPosition;
  unsigned long int referenceVersionPosition;
  unsigned long int strLength;
};

/*-----------------------GLOBAL VARIABLES & MACROS---------------------------*/

// blockSize is the size(in bytes) of the block to generate a semi-hash value;
// blockSize: 4k, 2K,
// 1024, 512, 256, 128, 64, 32
// 1021, 509, 251, 127, 61, 31
/*1024, 256*/
/*512, 128*/
/*256, 64*/
/*128, 32*/
/*64, 16*/
#define BLOCKSIZE 1024
#define BLOCKSIZEQUARTER 256

// hashValueLength, the size(in bytes) of hash values.
// hashValueLength: 4, 2 Bytes;
// Fix hashValueLength to be 4 bytes!
#define HASHVALUELENGTH 4

// spanLength, i.e. the rolling distance, is the span distance(in bytes) between two neighbouring block heads to generate two semi-hash values;
// spanLength: 4, 3, 2, 1 Bytes.
#define SPANLENGTH 1

// subscript length reflects the length of buffers.
// SUBSCRIPTLENGTH can be stored in 2(or 3) bytes.
// The remained 1(or 2) can be used to store other informations, such as repetion number.
#define SUBSCRIPTLENGTH 4

// The tested files:
// "./bibliographie.doc" is the reference file;
// "./bibliographieRI.doc" is the identical file;
#define FILENUMBER 2
char * testedFile[FILENUMBER] = {\
				 "./bibliographie.doc",\
				 "./bibliographieR.doc"
};

// ----- There are 3 file descriptors needed in the program:

// file descriptor for reference file
int fd;
off_t fileSize;
// file descriptor for the file to be compressed
int fdNew;
off_t fileSizeNew;
// file descriptor for file to be recovered
int fdRecovered;
off_t fileSizeRecovered;

struct stat statFile;

// CSHVnumber is total of CSHV generated by the reference file
/*CSHV, continuous semi-hash value*/
// CSHVnumber does not take into  count the repetitions.
unsigned long int CSHVnumber = 0;

// cshvNumberDiff does take into  count the repetitions.
unsigned long int cshvNumberDiff = 0;

// We choose many CSHVs as landmarks, and use LandmarkNumber
// to indicate the total of landmarks
// as well as the size of bufPtrLandmarks,
// not in bytes, but in (HASHVALUELENGTH + SUBSCRIPTLENGTH) bytes.
unsigned long int LandmarkNumber;

// ----- There are 6 buffers needed in the program:

// buffer to store the original file
char * bufPtrFile = NULL;
size_t bufPtrFileSize = 0;

// buffer to store the new file, i.e., the file to be compressed
char * bufPtrFileNew = NULL;
size_t bufPtrFileNewSize = 0;

// buffer to store the CSHVs and corresponding subscripts
char * bufPtrSemiHashValue = NULL;
size_t bufPtrSemiHashValueSize = 0;

// buffer to store landmarks of CSHVs
char * bufPtrLandmarks = NULL;
size_t bufPtrLandmarksSize = 0;

// buffer to store the information of the same substring between the 2 files.
/*SSS, same sub-string*/
// SSS includes 3 parts:
//          sss.compressedVersionPosition;
//          sss.referenceVersionPosition;
//          sss.strLength;
char * bufPtrSameSubString = NULL;
size_t bufPtrSameSubStringSize = 0;

// We should record the total of same sub-strings
// sssNumber is the total of (struct sameSubString).
unsigned long int sssNumber;

// buffer to store the transcript template
/*TT, transcript template, i.e., compressed increment between two files.*/
// TT includes two parts:
// 1. identical part. This part consists of
//          unsigned char ttTag = 0;
//          unsigned long int strLength;
//          unsigned long int refStartSubscript;
// 2. peculiar part. This part consists of
//          unsigned char ttTag = 1;
//          unsigned long int strLength;
//          unsigned char buf[strLength];
char * bufPtrTranscriptTemplate = NULL;
size_t bufPtrTranscriptTemplateSize = 0;

// ttNumber is to recorder the total of identical and peculiar parts.
unsigned long int ttNumber;

// buffer to store the new file recovered from TT and referece file
char * bufPtrFileRecovered = NULL;
size_t bufPtrFileRecoveredSize = 0;

// statistics of semi-hash value and its frequency:
// ----- we records the top 5 frequentest semi-hash values:
unsigned long int shvFrequentest1;
unsigned long int shvFrequentest2;
unsigned long int shvFrequentest3;
unsigned long int shvFrequentest4;
unsigned long int shvFrequentest5;
// ----- and their respective frequencies;
unsigned long int frequencyTop1;
unsigned long int frequencyTop2;
unsigned long int frequencyTop3;
unsigned long int frequencyTop4;
unsigned long int frequencyTop5;
// ----- and records the SHV occurences 
//    with freqency 1, 2, 3, 4, 5.
unsigned long int occurrenceOnce;
unsigned long int occurrenceTwice;
unsigned long int occurrence3Times;
unsigned long int occurrence4Times;
unsigned long int occurrence5Times;
// ----- the following counter to record the repetition of SHVs.
// cshvNumberDiff + hashValueRepetitionCounter == CSHVnumber
unsigned long int hashValueRepetitionCounter = 0;

// When constructing landmarks, we use SHVs that occurs least.
// We only deal with the following cases:
// 1. SHVs that occurs once;
// 2. SHVs that occurs once or twice;
// 3. SHVs that are not the frequentest ones.
unsigned short int landmarkSHVoccurence;

// When comparing the reference and the new files,
// we need 4 subscripts to refer to corresponding positions,
// at the same time we should know the lengths of the 2 sub-strings.
// After comparing,
// at headSubscriptRefFile and headSubscriptNewFile, the corresponding 
// characters are different; so are at tailSubscriptRefFile and  
// tailSubscriptNewFile.
unsigned long int headSubscriptRefFile;
unsigned long int headSubscriptNewFile;
unsigned long int tailSubscriptRefFile;
unsigned long int tailSubscriptNewFile;
unsigned long int headSSSlength;
unsigned long int tailSSSlength;

// When constructing SSSs,
// We need scour bufPtrFileNew bi-directionally,
// hence use the 2 indicators below.
unsigned long int forwardBufPtrFileNewSubscript;
unsigned long int backwardBufPtrFileNewSubscript;

// During binary search in bufPtrLandmarks,
// if find the same SHV,  the corresponding subscripts,
// which are stored in global array sameSHVsubscript[5];
// and the total of same SHVs are in sameSHVnumber,
// which is limited within 5.
unsigned long int sameSHVsubscript[5];
unsigned short int sameSHVnumber;

// The following variable is used in the ratchet algorithm
// to expand same sub-string:
// propellingSubscript pushes SSSs inserted from left to right.
// Notice: At the subscript propellingSubscript, 
// the correponding characters are the same;
// as recorded in the last inserted SSS.
unsigned long int propellingSubscript;

// When we find several(at most 5) same sub-strings in reference version,
// for a sub-string in the compressed version,
// after we expand the same sub-strings,
// we have to tackle the border overlapping of the sub-strings,
// hence, we store the expansions into the global variables below.
struct sameSubString sssOverlapping[5];
unsigned short int sssOverlappingNumber;

// jumpDistance is used to jump a distance when find
// a pair of identical sub-strings, and expand them bothway;
unsigned long int jumpDistance;

/*------------------------- FUNCTIONS----------------------------------------*/

/*====================AUXILIARY FUNCTIONS========================*/

unsigned short int isBigEndian();

void outputFileInfo(unsigned short int fileNo);

void swapTwo8Bytes(\
		   void * ptr1,\
		   void * ptr2);

// retSHV is OUT.
void calcCurrentSHV(\
		    char * ptrWhichFile,\
		    unsigned long int startSubscript,\
		    unsigned long int * retSHV);

// retSHV is IN and OUT.
void calcNextSHV(\
		 char * ptrWhichFile,\
		 unsigned long int startSubscript,\
		 unsigned long int * retSHV);

/*====================STEP 1=====================================*/
/*GENERATE THE CONTINUOUS SEMI-HASH VALUES OF THE REFERENCE FILE.*/

// open the file of reference version
void openFileReference();

// calculate the interation time of semi-hash operation
// i.e., the number of semi-hash values
unsigned int calcCSHVNumber();

// memory allocation for semi-hash values;
// memory size:
// CSHVnumber*(HASHVALUELENGTH+SUBSCRIPTLENGTH)
int mallocBufPtrSemiHashValue();

// close the file of reference version
void closeFileReference();

// insert iterationNumber SHVs into ptrBufSHVs;
// One SHV is 4 bytes from another neighbouring SHV.
// And originalSHV is the original SHV.
void contiSHV(\
	      char * ptrBufFileStartingPoint,\
	      char * ptrBufSHVsStartingPoint,\
	      unsigned long int startSubscript,\
	      unsigned long int originalSHV,\
	      unsigned long int iterationNumber);

// continuous semi-hash
// 01: bitwise XOR
// spanlength is 1 byte, i.e., rolling distance.
int contiSemiHash01();

// Sort buffer of semi-hash values with min-heap sorting
// J. Willioms --- 1964
// In C language, the subscripts is between 0 and (array length - 1).
// But the subscripts for the binary tree in heap sorting,
// it's needed to use subscripts between 0 and (array length).
// pos : floor(n/2)...1. 
// here n is the global variable, CSHVnumber.
void minHeapSieve(unsigned long int pos,unsigned long int posEnd);

void heapSortBufPtrSemiHashValue();

// The following functions is to calculate the statistics of SHV repetition.

void modifyLeastOccurence(\
			  unsigned long int frequency);

void shiftFrequentestSHV(\
			 unsigned short int shiftPos);

void changeFrequentestSHV(\
			  unsigned long int frequency,\
			  unsigned long int semiHashValue);

void statisticsSHV(\
		   unsigned long int frequency,\
		   unsigned long int semiHashValue);

void outputSHVstatistics();

int calcHashValueRepetition();

/*==========================STEP 2===============================*/
/*======================OBTAIN LANDMARK==========================*/

// SHVs that repeats 1 or times consist of the main part of SHVs.
// We use the peculiar SHVs to construct the landmarks.

void determineLandmarkSHVoccrence();

void mallocBufPtrLandmarks();

void genLandmarks1();

void genLandmarks2();

// if we include into landmarks all the SHVs 
// whose frequency is not more than frequencyTop1/100,
// then we must allocate memory dynamically with realloc.
// To simplify the program, we only include SHVs
// with frequency 1, 2, 3, 4, or 5.
void genLandmarks3();

// If the total of SHVs with frequency 1, 2, 3, 4, or 5 are
// less than (CSHVnumber / 2), then use bufPtrSemiHashValue 
// as bufPtrLandmarks; and search a SHV backward or forward at most
// 5 times in bufPtrSemiHashValue.
void genLandmarks4();

// free the memory for bufPtrSemiHashValue, if possible.
void substitueLandmarksForBufSHV();

void generateLandmarks();

/*==========================STEP 3===============================*/
/*====================OBTAIN SAME SUBSTRINGS=====================*/

// open the file of new version
void openFileNew();

// close the file of new version
void closeFileNew();

// compare two files from two directions simutaneously.
void cmpFilesBiDirection();

// after tackling the border overlapping, 
// insert sss at the end of sss-list.
void appendListSSS(\
		   struct sameSubString sss);

// insert the first same sub-string
void insertFirstSSS();

// If find the same SHV in bufPtrLandmarks at ptrLi,
// then search the SHV backward.
// The total of same SHVs is limited within 5.
void searchSHVbackward(\
		       unsigned long int shv,\
		       unsigned long int * ptrLi);

// If find the same SHV in bufPtrLandmarks at ptrLi,
// then search the SHV forward.
// The total of same SHVs is limited within 5.
void searchSHVforward(\
		      unsigned long int shv,\
		      unsigned long int * ptrLi);

// binary search in bufPtrLandmarks
// If find the same SHV, return the corresponding subscripts,
// which are stored in global array sameSHVsubscript[5];
// the global short int sameSHVnumber returns the number of same SHVs.
// the number is limited within 5.
unsigned short int binarySearchSHV(\
		     unsigned long int shv);

// if two substrings of size BLOCKSIZE are identical, return 1;
// else return 0.
unsigned short int identicalSubStrings(\
				       unsigned char * ptrRef,\
				       unsigned char * ptrCmpr);

// if two substrings of arbitrary size are identical, return 1;
// else return 0.
unsigned char identicalPlateau(\
			       unsigned long int compressedVersionPosition,\
			       unsigned long int referenceVersionPosition,\
			       unsigned long int strLength);

// Having found 2 identical sub-strings of length BLOCKSIZE,
// expand the same sub-string bothway.
// Parameter: sss is IN and OUT.
// We use a ratchet algorithm here,
// i.e., 
// we set a global variable, propellingSubscript
// to indicate the last subscript of the compressed version,
// which equals the last inserted sameSubString's
// compressedVersionPosition plus strLength.
// Hence at propellingSubscript, the correponding byte
// has been found an identical one.
void expandSSS(\
	       struct sameSubString * sss);

// Here we tackle the border overlapping of SSSs 
// according to the same sub-string of the compressed version.
// Because at most 5 identical SHVs can be found,
// hence, we can store all the expansions
// (which have the same middle part) into 
// the global variables, sssOverlapping[5] and sssOverlappingNumber;
// then select the foremost expasion as the 1st inserted SSS,
// denote it from sssOverlapping[i];
// cut the remaining part (if it exists) from
// the tail of sssOverlapping[i] to the hindermost expansion
// as the 2nd inserted SSS.
// The first step of tackleBorderOverlapping is to
// tag the foremost and hindermost expansion of sssOverlapping.
// In tackleBorderOverlapping,
// insert related SSSs according to the fore SSS and hinder SSS,
// using appendListSSS().
void tackleBorderOverlapping();

// When find several same SHVs,
// check whether there are same sub-strings.
// If there are SSSs in the global array, sameSHVsubscript,
// then determine whether they are identical sub-strings;
// and expandSSS, tackleBorderOverlapping, appendListSSS.
// Before exiting the function, if needed,
// change forwardBufPtrFileNewSubscript.
// If there is insertion of SSS,
// forwardBufPtrFileNewSubscript should be increased
// at least BLOCKSIZE.
void checkSSSs();

// find and insert all the same sub-strings 
// except for the first one and the last one.
void findInsertSSSs();

// insert the last same sub-string
void insertLastSSS();

// generate buffer of same sub-strings
// It's recommended to delete
// insertFirstSSS(), and insertLastSSS();
// only use findInsertSSSs() to generateSSS().
void generateSSS();

/*==========================STEP 4===============================*/
/*====================OBTAIN INSERTED SCRAPS=====================*/

// generate scraps
void generateScraps();

/*==========================STEP 5===============================*/
/*====================OBTAIN TRANSCRIPT TEMPLATE=================*/

// append identical sub-strings to TT;
// the TT tag is 0.
void appendTTidentical(\
		       unsigned long int strLength,\
		       unsigned long int refStartSubscript);

// append peculiar sub-string to TT;
// the TT tag is 1.
void appendTTpeculiar(\
		      unsigned long int strLength,\
		      unsigned long int cmprStartSubscript);

// generate transcript template
void generateTT();

// output the delta information
void outputDeltaSize();

/*==========================STEP 6===============================*/
/*====================RECOVER NEW FILE===========================*/

// scour TT and obtain the length of the file to be recovered.
unsigned long int obtainFileLength();

// write recovered file buffer - identical part
void writeBufRecovFileIdentical(\
				unsigned long int curBufRecovFileSubscript,\
				unsigned long int refStartSubscript,\
				unsigned long int strLength);

// write recovered file buffer - peculiar part
void writeBufRecovFilePeculiar(\
			       unsigned long int curBufRecovFileSubscript,\
			       unsigned long int fromTTSubscript,\
			       unsigned long int strLength);

// recover the file of new version
void recoverFileNew();

// free memory allocated
void freeMemoryAllocated();

/*------------------------------------END------------------------------------*/

void test();

/*------------------------------------OK-------------------------------------*/
